#importação das bibliotecas iniciais
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme; color_codes=True remaps matplotlib's
# shorthand color codes (e.g. "b", "g", "r") to the seaborn palette.
sns.set(color_codes=True)
# IPython magic (not plain Python): render matplotlib figures inline in the notebook.
%matplotlib inline
# Load the plant's generation and weather-sensor datasets.
# Forward slashes keep the relative paths portable across operating systems
# and avoid the invalid "\P" escape sequence that a backslash inside a normal
# (non-raw) string literal produces.
geracao = pd.read_csv('dados/Plant_1_Generation_Data.csv')
clima = pd.read_csv('dados/Plant_1_Weather_Sensor_Data.csv')
# Quick overview of the datasets: show the first five rows of each one.
geracao.head(n=5)
clima.head(n=5)
| | DATE_TIME | PLANT_ID | SOURCE_KEY | AMBIENT_TEMPERATURE | MODULE_TEMPERATURE | IRRADIATION |
|---|---|---|---|---|---|---|
| 0 | 2020-05-15 00:00:00 | 4135001 | HmiyD2TTLFNqkNe | 25.184316 | 22.857507 | 0.0 |
| 1 | 2020-05-15 00:15:00 | 4135001 | HmiyD2TTLFNqkNe | 25.084589 | 22.761668 | 0.0 |
| 2 | 2020-05-15 00:30:00 | 4135001 | HmiyD2TTLFNqkNe | 24.935753 | 22.592306 | 0.0 |
| 3 | 2020-05-15 00:45:00 | 4135001 | HmiyD2TTLFNqkNe | 24.846130 | 22.360852 | 0.0 |
| 4 | 2020-05-15 01:00:00 | 4135001 | HmiyD2TTLFNqkNe | 24.621525 | 22.165423 | 0.0 |
# Summary statistics (count, mean, std, min, quartiles, max) of the
# variables in each dataset.
geracao.describe(percentiles=[0.25, 0.5, 0.75])
clima.describe(percentiles=[0.25, 0.5, 0.75])
| | PLANT_ID | AMBIENT_TEMPERATURE | MODULE_TEMPERATURE | IRRADIATION |
|---|---|---|---|---|
| count | 3182.0 | 3182.000000 | 3182.000000 | 3182.000000 |
| mean | 4135001.0 | 25.531606 | 31.091015 | 0.228313 |
| std | 0.0 | 3.354856 | 12.261222 | 0.300836 |
| min | 4135001.0 | 20.398505 | 18.140415 | 0.000000 |
| 25% | 4135001.0 | 22.705182 | 21.090553 | 0.000000 |
| 50% | 4135001.0 | 24.613814 | 24.618060 | 0.024653 |
| 75% | 4135001.0 | 27.920532 | 41.307840 | 0.449588 |
| max | 4135001.0 | 35.252486 | 65.545714 | 1.221652 |
# Count the missing values in every column of both datasets and report them.
a = geracao.isna().sum()
b = clima.isna().sum()
print('Quantidade de elemesntos nulo da geração:', a, sep='\n')
print('Quantidade de elemesntos nulo ddo clima:', b, sep='\n')
Quantidade de elemesntos nulo da geração: DATE_TIME 0 PLANT_ID 0 SOURCE_KEY 0 DC_POWER 0 AC_POWER 0 DAILY_YIELD 0 TOTAL_YIELD 0 dtype: int64 Quantidade de elemesntos nulo ddo clima: DATE_TIME 0 PLANT_ID 0 SOURCE_KEY 0 AMBIENT_TEMPERATURE 0 MODULE_TEMPERATURE 0 IRRADIATION 0 dtype: int64
# Convert the timestamp columns to datetime.
# The two files use different layouts: the generation format is day-first
# ("dd-mm-YYYY HH:MM"); the weather values shown in the preview above carry
# seconds ("YYYY-mm-dd HH:MM:SS"), so the format must include "%S" — recent
# pandas versions reject a format that leaves part of the string unparsed.
geracao['DATE_TIME'] = pd.to_datetime(geracao['DATE_TIME'], format='%d-%m-%Y %H:%M')
clima['DATE_TIME'] = pd.to_datetime(clima['DATE_TIME'], format='%Y-%m-%d %H:%M:%S')

# Create separate date and time-of-day columns.
# The vectorized .dt accessors yield the same datetime.date / datetime.time
# objects as a row-by-row apply, without the Python-level loop.
geracao['DATE'] = geracao['DATE_TIME'].dt.date
geracao['TIME'] = geracao['DATE_TIME'].dt.time
clima['DATE'] = clima['DATE_TIME'].dt.date
clima['TIME'] = clima['DATE_TIME'].dt.time
geracao.tail()
clima.tail()
| | DATE_TIME | PLANT_ID | SOURCE_KEY | AMBIENT_TEMPERATURE | MODULE_TEMPERATURE | IRRADIATION | DATE | TIME |
|---|---|---|---|---|---|---|---|---|
| 3177 | 2020-06-17 22:45:00 | 4135001 | HmiyD2TTLFNqkNe | 22.150570 | 21.480377 | 0.0 | 2020-06-17 | 22:45:00 |
| 3178 | 2020-06-17 23:00:00 | 4135001 | HmiyD2TTLFNqkNe | 22.129816 | 21.389024 | 0.0 | 2020-06-17 | 23:00:00 |
| 3179 | 2020-06-17 23:15:00 | 4135001 | HmiyD2TTLFNqkNe | 22.008275 | 20.709211 | 0.0 | 2020-06-17 | 23:15:00 |
| 3180 | 2020-06-17 23:30:00 | 4135001 | HmiyD2TTLFNqkNe | 21.969495 | 20.734963 | 0.0 | 2020-06-17 | 23:30:00 |
| 3181 | 2020-06-17 23:45:00 | 4135001 | HmiyD2TTLFNqkNe | 21.909288 | 20.427972 | 0.0 | 2020-06-17 | 23:45:00 |
# Generation of the PV modules.
# Mean DAILY_YIELD for every (time of day, inverter) pair, reshaped so that
# rows are times of day and columns are inverter SOURCE_KEYs.
# groupby does not modify its input, so no defensive copy is needed.
geracao_dia = geracao.groupby(['TIME', 'SOURCE_KEY'])['DAILY_YIELD'].mean().unstack()
# figsize is passed to .plot() directly: DataFrame.plot() opens its own
# figure, so a preceding plt.figure(...) only leaves an empty figure behind.
geracao_dia.iloc[:, 0:1].plot(figsize=(10, 5))
# NOTE(review): the title says DC power while the plotted column is
# DAILY_YIELD — confirm which label is intended.
plt.title('Potência DC em um inversor Usina 1')
plt.ylabel('kWh')
plt.xlabel('Tempo')
Text(0.5, 0, 'Tempo')
<Figure size 720x360 with 0 Axes>
# Average conversion efficiency of each inverter.
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises a TypeError on the object-typed DATE/TIME columns.
conv_Inv = geracao.groupby(['SOURCE_KEY']).mean(numeric_only=True)
# NOTE(review): the factor 1000 presumably combines the percent conversion
# with a scale difference between the AC and DC columns — confirm.
eficiencia = conv_Inv['AC_POWER'] * 1000 / conv_Inv['DC_POWER']
eficiencia.plot(figsize=(15, 5), style='o--')
# Horizontal reference line at the mean efficiency across inverters.
plt.axhline(eficiencia.mean(), linestyle='--', color='green')
plt.title('Eficiência dos Inversores', size=20)
plt.ylabel('Eficiência (%)')
plt.xlabel('ID dos inversores')
Text(0, 0.5, 'Eficiência %')
# DC power generated by the modules.
# Rows are times of day, columns are dates; each value is the DC_POWER summed
# over all records sharing that time and date.
geracao_cc = geracao.groupby(['TIME', 'DATE'])['DC_POWER'].sum().unstack()
fig, ax = plt.subplots(ncols=2, nrows=1, dpi=200, figsize=(20, 5))
# Plot first: pandas sets the x-axis label to the index name ('TIME') when
# plotting, so labels applied beforehand would be overwritten.
geracao_cc.iloc[:, 0:1].plot(ax=ax[0], linewidth=5)
geracao_cc.iloc[:, 1:2].plot(ax=ax[1], linewidth=5, color='orange')
# NOTE(review): the two plotted columns are the first two dates (summed over
# inverters), not two inverters as the titles suggest — confirm the wording.
ax[0].set_title('Potência DC em um inversor A da Usina 1')
ax[0].set_ylabel('kW')
ax[0].set_xlabel('Tempo')
ax[1].set_title('Potência DC em um inversor B da Usina 1')
ax[1].set_ylabel('kW')
ax[1].set_xlabel('Tempo')
<AxesSubplot:title={'center':'Potência DC em um inversor B da Usina 1'}, xlabel='TIME', ylabel='kW'>
# AC power converted by the inverter.
# Rows are times of day, columns are dates; each value is the AC_POWER summed
# over all records sharing that time and date.
geracao_ac = geracao.groupby(['TIME', 'DATE'])['AC_POWER'].sum().unstack()
fig, ax = plt.subplots(ncols=2, nrows=1, dpi=200, figsize=(20, 5))
# Plot first: pandas sets the x-axis label to the index name ('TIME') when
# plotting, so labels applied beforehand would be overwritten.
geracao_ac.iloc[:, 0:1].plot(ax=ax[0], linewidth=5)
geracao_ac.iloc[:, 1:2].plot(ax=ax[1], linewidth=5, color='orange')
# NOTE(review): the two plotted columns are the first two dates (summed over
# inverters), not two inverters as the titles suggest — confirm the wording.
ax[0].set_title('Potência AC em um inversor A da Usina 1')
ax[0].set_ylabel('kW')
ax[0].set_xlabel('Tempo')
ax[1].set_title('Potência AC em um inversor B da Usina 1')
ax[1].set_ylabel('kW')
ax[1].set_xlabel('Tempo')
<AxesSubplot:title={'center':'Potência AC em um inversor B da Usina 1'}, xlabel='TIME', ylabel='kW'>
# Aggregate the generation data per timestamp (DATE_TIME), summing the
# readings of all inverters recorded at the same instant.
# numeric_only=True keeps only the numeric columns in the result; without it,
# pandas >= 2.0 tries to sum the object-typed DATE/TIME columns and fails.
geracao_diaria = geracao.groupby(['DATE_TIME'], as_index=False).sum(numeric_only=True)
geracao_diaria.head()
| | DATE_TIME | PLANT_ID | DC_POWER | AC_POWER | DAILY_YIELD | TOTAL_YIELD |
|---|---|---|---|---|---|---|
| 0 | 2020-05-15 00:00:00 | 86835021 | 0.0 | 0.0 | 0.0 | 143581676.0 |
| 1 | 2020-05-15 00:15:00 | 86835021 | 0.0 | 0.0 | 0.0 | 143581676.0 |
| 2 | 2020-05-15 00:30:00 | 86835021 | 0.0 | 0.0 | 0.0 | 143581676.0 |
| 3 | 2020-05-15 00:45:00 | 86835021 | 0.0 | 0.0 | 0.0 | 143581676.0 |
| 4 | 2020-05-15 01:00:00 | 90970022 | 0.0 | 0.0 | 0.0 | 150761642.0 |
# Keep only the variables under study, then inspect a few rows
# (positions 45 to 49) of the selection.
geracao_select = geracao_diaria.loc[
    :, ['DATE_TIME', 'DC_POWER', 'AC_POWER', 'DAILY_YIELD']
]
geracao_select.iloc[45:50]
| | DATE_TIME | DC_POWER | AC_POWER | DAILY_YIELD |
|---|---|---|---|---|
| 45 | 2020-05-15 11:15:00 | 160301.226190 | 15683.713690 | 47752.761904 |
| 46 | 2020-05-15 11:30:00 | 155409.160714 | 15207.283929 | 51726.053571 |
| 47 | 2020-05-15 11:45:00 | 167668.196427 | 16401.589286 | 55271.107142 |
| 48 | 2020-05-15 12:00:00 | 155821.696428 | 15250.808333 | 59620.946429 |
| 49 | 2020-05-15 12:15:00 | 209569.398819 | 20477.017856 | 63932.303572 |
# Remove the plant and sensor identifier columns, which are
# irrelevant for the prediction.
clima_drop = clima.drop(columns=['PLANT_ID', 'SOURCE_KEY'])
clima_drop.head()
| | DATE_TIME | AMBIENT_TEMPERATURE | MODULE_TEMPERATURE | IRRADIATION | DATE | TIME |
|---|---|---|---|---|---|---|
| 0 | 2020-05-15 00:00:00 | 25.184316 | 22.857507 | 0.0 | 2020-05-15 | 00:00:00 |
| 1 | 2020-05-15 00:15:00 | 25.084589 | 22.761668 | 0.0 | 2020-05-15 | 00:15:00 |
| 2 | 2020-05-15 00:30:00 | 24.935753 | 22.592306 | 0.0 | 2020-05-15 | 00:30:00 |
| 3 | 2020-05-15 00:45:00 | 24.846130 | 22.360852 | 0.0 | 2020-05-15 | 00:45:00 |
| 4 | 2020-05-15 01:00:00 | 24.621525 | 22.165423 | 0.0 | 2020-05-15 | 01:00:00 |
# Combine generation and weather records that share the same timestamp
# (inner join on DATE_TIME), then build a variant without DATE/TIME.
usine = geracao_select.merge(clima_drop, on='DATE_TIME', how='inner')
usine_no_time = usine.drop(columns=['DATE', 'TIME'])
usine_no_time.head()
| | DATE_TIME | DC_POWER | AC_POWER | DAILY_YIELD | AMBIENT_TEMPERATURE | MODULE_TEMPERATURE | IRRADIATION |
|---|---|---|---|---|---|---|---|
| 0 | 2020-05-15 00:00:00 | 0.0 | 0.0 | 0.0 | 25.184316 | 22.857507 | 0.0 |
| 1 | 2020-05-15 00:15:00 | 0.0 | 0.0 | 0.0 | 25.084589 | 22.761668 | 0.0 |
| 2 | 2020-05-15 00:30:00 | 0.0 | 0.0 | 0.0 | 24.935753 | 22.592306 | 0.0 |
| 3 | 2020-05-15 00:45:00 | 0.0 | 0.0 | 0.0 | 24.846130 | 22.360852 | 0.0 |
| 4 | 2020-05-15 01:00:00 | 0.0 | 0.0 | 0.0 | 24.621525 | 22.165423 | 0.0 |
# Pairwise scatter plots to get a first impression of how the
# generation and weather variables relate to each other.
sns.pairplot(
    usine.loc[
        :,
        [
            'DC_POWER',
            'AC_POWER',
            'DAILY_YIELD',
            'AMBIENT_TEMPERATURE',
            'MODULE_TEMPERATURE',
            'IRRADIATION',
        ],
    ]
)
<seaborn.axisgrid.PairGrid at 0x247a35bbd30>
# Observe how the weather variables and the PV module generation behave
# over the day: average every variable per time of day.
usine_clima = usine.copy()
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises a TypeError on the object-typed DATE column.
clima_cc = usine_clima.groupby(['TIME']).mean(numeric_only=True)
fig, ax = plt.subplots(ncols=2, nrows=2, dpi=200, figsize=(15, 5))
clima_cc['IRRADIATION'].plot(ax=ax[0, 0])
clima_cc['AMBIENT_TEMPERATURE'].plot(ax=ax[0, 1])
clima_cc['MODULE_TEMPERATURE'].plot(ax=ax[1, 0])
clima_cc['DC_POWER'].plot(ax=ax[1, 1])
ax[0, 0].set_ylabel('IRRADIATION')
ax[0, 1].set_ylabel('AMBIENT TEMPERATURE')
ax[1, 0].set_ylabel('MODULE TEMPERATURE')
ax[1, 1].set_ylabel('DC POWER')
Text(0, 0.5, 'DC POWER')
# Correlation between the plant variables, used to choose the most
# appropriate predictors for DC generation.
# Shorter column names (assigned by position) keep the heatmap labels compact.
usine_no_time.columns = ['DATE_TIME','DC_POWER','AC_POWER','DAILY_YIELD','AMBIENT','MODULE','IRRADIATION']
one_correlation = usine_no_time[['DC_POWER','AC_POWER','DAILY_YIELD','AMBIENT','MODULE','IRRADIATION']]
corr = one_correlation.corr()
fig_dims = (2, 2)  # NOTE(review): not read anywhere in the visible code
# Hide the strict upper triangle with a structural boolean mask. Deriving the
# mask from the correlation values themselves would leave a cell unmasked
# whenever its correlation is exactly 0.
sns.heatmap(
    corr.round(2),
    annot=True,
    mask=np.triu(np.ones(corr.shape, dtype=bool), k=1),
)
plt.savefig('correla.png', format='png')
# After choosing the variables with the highest correlation, build the
# final dataset used for modelling.
base = usine[['DC_POWER', 'MODULE_TEMPERATURE', 'IRRADIATION']]
base.describe()
# `usine_one` is never defined in the visible code; the merged `usine`
# frame is the one that holds both DC_POWER and DATE_TIME.
resultados = usine[['DC_POWER', 'DATE_TIME']]